In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore

image.png

Description:¶

This datasets is related to red variants of the Portuguese "Vinho Verde" wine.The dataset describes the amount of various chemicals present in wine and their effect on it's quality. The datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are much more normal wines than excellent or poor ones).Your task is to predict the quality of wine using the given data.¶

A simple yet challenging project, to anticipate the quality of wine.The complexity arises due to the fact that the dataset has fewer samples, & is highly imbalanced.¶

In [2]:
df=pd.read_csv("WineQT.csv")
df
Out[2]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5 0
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5 1
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5 2
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6 3
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5 4
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1138 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6 1592
1139 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 6 1593
1140 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5 1594
1141 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6 1595
1142 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5 1597

1143 rows × 13 columns

This data frame contains the following columns:¶

Input variables (based on physicochemical tests):\¶

1 - fixed acidity\¶

2 - volatile acidity\¶

3 - citric acid\¶

4 - residual sugar\¶

5 - chlorides\¶

6 - free sulfur dioxide\¶

7 - total sulfur dioxide\¶

8 - density\¶

9 - pH\¶

10 - sulphates\¶

11 - alcohol\¶

Output variable (based on sensory data):\¶

12 - quality (score between 0 and 10)¶

In [3]:
df.drop('Id',axis=1,inplace=True)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 107.3 KB
In [5]:
df.isnull().sum()
Out[5]:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
In [36]:
sns.pairplot(df,hue='quality')
Out[36]:
<seaborn.axisgrid.PairGrid at 0x26ad170bee0>
In [41]:
plt.figure(figsize=(14,6))
corr=df.corr(method='pearson')
heatmap=sns.heatmap(corr,annot=True,vmax=1,vmin=-1,linewidths=1,linecolor='White')
plt.show()
In [6]:
for i in df.columns:
    sns.distplot(df[i])
    plt.show()
In [7]:
for i in df.columns:
    sns.boxplot(df[i])
    plt.show()
In [8]:
df.columns
Out[8]:
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
In [3]:
df[['sulphates', 'alcohol', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide']]=np.log(df[['sulphates', 'alcohol','residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide']])
In [4]:
df.drop(df[df['citric acid']==0.00].index,inplace=True)
In [5]:
df['citric acid'].value_counts()
Out[5]:
0.49    47
0.24    42
0.02    35
0.01    26
0.26    26
        ..
0.79     1
0.72     1
0.62     1
0.75     1
1.00     1
Name: citric acid, Length: 76, dtype: int64
In [6]:
df.apply(zscore)
Out[6]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
2 -0.363115 1.429920 -1.359801 -0.079212 0.364336 0.255610 0.529825 0.097443 -0.245963 0.038493 -0.578601 -0.834077 -1.732770
3 1.576848 -1.378363 1.426103 -0.614008 -0.254987 0.438930 0.679715 0.624490 -0.911038 -0.466849 -0.578601 0.397573 -1.730598
6 -0.306057 0.493826 -1.252651 -1.095047 -0.507753 0.255610 0.655805 -0.218785 0.020067 -1.494889 -0.998648 -0.834077 -1.724083
8 -0.363115 0.376814 -1.466952 -0.470430 -0.336923 -0.492571 -1.033108 -0.007966 0.419112 -0.543982 -0.891984 1.629223 -1.719740
9 -0.990750 0.376814 -1.145501 -0.765352 0.524767 0.255610 0.793588 -0.482309 -0.112948 -0.783770 -1.215424 -0.834077 -1.715397
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1138 -1.218981 -0.032727 -0.877626 -0.079212 -0.214835 1.221174 0.102883 -0.566636 0.818156 0.673145 0.585731 0.397573 1.719990
1139 -0.933692 0.610838 -1.145501 -0.614008 -0.552008 1.169778 0.029911 -0.160810 0.818156 1.068885 -0.891984 0.397573 1.722161
1140 -1.276038 0.493826 -1.145501 -0.470430 0.297709 1.365355 0.238476 -1.009356 1.017679 -0.466849 0.116824 -0.834077 1.724333
1141 -1.447212 0.201296 -1.038351 -0.203640 -0.832032 1.655100 0.448509 -0.893405 1.483231 0.731888 0.767352 0.397573 1.726505
1142 -1.447212 0.757102 -0.931201 -0.470430 -0.254987 1.365355 0.238476 -0.708939 1.815768 0.430071 -0.175360 -0.834077 1.730848

1044 rows × 13 columns

In [7]:
bins=(2,6.5,8)
group_names=['bad','good']
df['quality']=pd.cut(df['quality'],bins=bins,labels=group_names)
df['quality']=df['quality'].map({'bad':0,'good':1})
In [8]:
df['quality'].value_counts().plot.pie(autopct="%.2f%%")
Out[8]:
<AxesSubplot:ylabel='quality'>
In [11]:
df.head(5)
Out[11]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
2 7.8 0.76 0.04 0.832909 -2.385967 2.708050 3.988984 0.9970 3.26 -0.430783 2.282382 0
3 11.2 0.28 0.56 0.641854 -2.590267 2.833213 4.094345 0.9980 3.16 -0.544727 2.282382 0
6 7.9 0.60 0.06 0.470004 -2.673649 2.708050 4.077537 0.9964 3.30 -0.776529 2.240710 0
8 7.8 0.58 0.02 0.693147 -2.617296 2.197225 2.890372 0.9968 3.36 -0.562119 2.251292 1
9 6.7 0.58 0.08 0.587787 -2.333044 2.708050 4.174387 0.9959 3.28 -0.616186 2.219203 0
In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression
In [13]:
X=df.iloc[:,:-1]
y=df['quality']
In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=123)
In [15]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
(835, 11) (835,)
(209, 11) (209,)
In [16]:
lr=LinearRegression()
lr.fit(X_train,y_train)
Out[16]:
LinearRegression()
In [17]:
y_train_pred=lr.predict(X_train)
y_test_pred=lr.predict(X_test)
In [18]:
print('RMSE',round(np.sqrt(metrics.mean_squared_error(y_train,y_train_pred)),2))
print('R2 SCORE',metrics.r2_score(y_train,y_train_pred))
RMSE 0.3
R2 SCORE 0.2564554498612728
In [19]:
print('RMSE',round(np.sqrt(metrics.mean_squared_error(y_test,y_test_pred)),2))
print('R2 SCORE',metrics.r2_score(y_test,y_test_pred))
RMSE 0.31
R2 SCORE 0.30266733125135226

KNN METHOD¶

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [21]:
X=df.iloc[:,:-1]
y=df['quality']
In [22]:
kn=KNeighborsClassifier()
kn.fit(X_train,y_train)
Out[22]:
KNeighborsClassifier()
In [23]:
y_train_pred=kn.predict(X_train)
y_test_pred=kn.predict(X_test)
In [24]:
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.911377245508982
0.8564593301435407
In [25]:
from sklearn.model_selection import cross_val_score
k_neighbors=list(range(1,50,2))
cv_scores=[]
for k in k_neighbors:
    knn=KNeighborsClassifier(n_neighbors=k)
    scores=cross_val_score(knn,X_train,y_train,cv=10,scoring='accuracy')
    cv_scores.append((scores.mean()))
In [26]:
mse=[1-x for x in cv_scores]
mse.index
Out[26]:
<function list.index(value, start=0, stop=9223372036854775807, /)>
In [27]:
optimal_k=k_neighbors[mse.index(min(mse))]
optimal_k
Out[27]:
1

LOGISTIC REGRESSION¶

In [28]:
from sklearn.metrics import accuracy_score,classification_report,recall_score,precision_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
In [29]:
lor=LogisticRegression()
lor.fit(X_train,y_train)
Out[29]:
LogisticRegression()
In [30]:
y_train_pred=lor.predict(X_train)
y_test_pred=lor.predict(X_test)
In [31]:
y_train_proba=lor.predict_proba(X_train)
y_test_proba=lor.predict_proba(X_test)
In [32]:
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.8658682634730539
0.8516746411483254
In [33]:
print(confusion_matrix(y_train,y_train_pred))
print(confusion_matrix(y_test,y_test_pred))
[[701  14]
 [ 98  22]]
[[174   0]
 [ 31   4]]
In [34]:
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       715
           1       0.61      0.18      0.28       120

    accuracy                           0.87       835
   macro avg       0.74      0.58      0.60       835
weighted avg       0.84      0.87      0.83       835

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       174
           1       1.00      0.11      0.21        35

    accuracy                           0.85       209
   macro avg       0.92      0.56      0.56       209
weighted avg       0.87      0.85      0.80       209

DECISION TREE¶

In [39]:
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
In [40]:
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
Out[40]:
DecisionTreeClassifier()
In [41]:
y_train_pred=dt.predict(X_train)
y_test_pred=dt.predict(X_test)
In [42]:
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       715
           1       1.00      1.00      1.00       120

    accuracy                           1.00       835
   macro avg       1.00      1.00      1.00       835
weighted avg       1.00      1.00      1.00       835

              precision    recall  f1-score   support

           0       0.91      0.92      0.91       174
           1       0.58      0.54      0.56        35

    accuracy                           0.86       209
   macro avg       0.74      0.73      0.74       209
weighted avg       0.85      0.86      0.85       209

In [43]:
dt1=DecisionTreeClassifier(max_depth=10)
dt1.fit(X_train,y_train)
Out[43]:
DecisionTreeClassifier(max_depth=10)
In [44]:
y_train_pred=dt1.predict(X_train)
y_test_pred=dt1.predict(X_test)
In [45]:
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       715
           1       0.97      0.97      0.97       120

    accuracy                           0.99       835
   macro avg       0.98      0.98      0.98       835
weighted avg       0.99      0.99      0.99       835

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       174
           1       0.55      0.49      0.52        35

    accuracy                           0.85       209
   macro avg       0.72      0.70      0.71       209
weighted avg       0.84      0.85      0.84       209

In [46]:
fig,ax=plt.subplots(figsize=(10,10))
chart=plot_tree(dt1,max_depth=3,feature_names=X.columns,filled=True,fontsize=10)
In [ ]: